Project Source Codes

Crash Clarity: Data-Driven Insights for Enhancing UK Road Safety using statistical models

Loading Libraries:

# Load required libraries

suppressWarnings({
 library(tidyverse)
library(ggplot2)
library(plotly)  
library(maps)
library(leaflet)
library(readr)

library(plotly)
library(dplyr)
library(viridis) 
library(rnaturalearth)  # For loading world map data


})
# Load necessary libraries
library(dplyr)
library(ggplot2)
library(plotly)

# Load the data
road_casualities <- read.csv("C:\\Users\\Venkata\\Desktop\\stat1\\Final project\\casualty_statistics.csv")

# Data Cleaning Steps
# 1. Remove rows with missing or NA values in important columns
road_casualities <- road_casualities %>%
  filter(!is.na(accident_severity), 
         !is.na(weather_conditions), 
         !is.na(road_surface_conditions))

# 2. Ensure the accident_severity column has valid values (1, 2, 3)
road_casualities <- road_casualities %>%
  filter(accident_severity %in% c(1, 2, 3))

# 3. Convert necessary columns to factors
road_casualities$accident_severity <- factor(road_casualities$accident_severity, 
                                 levels = c(1, 2, 3),
                                 labels = c("Life-Threatening", "Significant", "Mild"))
road_casualities$weather_conditions <- as.factor(road_casualities$weather_conditions)
road_casualities$road_surface_conditions <- as.factor(road_casualities$road_surface_conditions)
road_casualities$urban_or_rural_area <- as.factor(road_casualities$urban_or_rural_area)

# 4. Remove duplicates if any
road_casualities <- road_casualities %>% distinct()
# Visualization with Cleaned Data
# Assign updated custom colors for each severity level
severity_colors <- c("Life-Threatening" = "red", 
                     "Significant" = "darkorange", 
                     "Mild" = "darkgreen")

# Create an Interactive Histogram with Updated Colors
histogram <- ggplot(road_casualities, aes(x = accident_severity, fill = accident_severity)) +
  geom_bar(alpha = 0.7, color = "black") +
  scale_fill_manual(values = severity_colors) +
  labs(title = "Distribution of Accident Severity",
       x = "Accident Severity",
       y = "Count") 

# Convert to interactive using plotly
interactive_histogram <- ggplotly(histogram, tooltip = c("count", "x"))

# Display the interactive histogram
interactive_histogram
library(ggplot2)
library(plotly)

# Load dataset
road_casualities <- read.csv("C:\\Users\\Venkata\\Desktop\\stat1\\Final project\\casualty_statistics.csv")

# Convert time column to a usable format (assuming time is in 24-hour format)
road_casualities$time_of_day <- as.numeric(substr(road_casualities$time, 1, 2))

# Define time bands based on STATS20 guidance
road_casualities$time_band <- cut(
  road_casualities$time_of_day,
  breaks = c(-1, 5, 9, 15, 19, 23),
  labels = c("Night (Midnight to 5 AM)", 
             "Morning Rush Hour", 
             "Daytime", 
             "Evening Rush Hour", 
             "Night (8 PM to 11 PM)")
)

# Aggregate the data by time bands
time_band_summary <- road_casualities %>%
  group_by(time_band) %>%
  summarise(total_accidents = n()) %>%
  arrange(desc(total_accidents))

# Create a ggplot object with gradient colors
time_band_plot <- ggplot(time_band_summary, aes(x = time_band, y = total_accidents, fill = total_accidents)) +
  geom_bar(stat = "identity", color = "black", alpha = 0.8) +
  labs(
    title = "Accidents by Time Bands (STATS20)",
    x = "Time Band",
    y = "Number of Accidents"
  ) +
  scale_fill_gradient(low = "pink", high = "red", name = "Total Accidents") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Convert the plot to interactive using plotly
interactive_time_band_plot <- ggplotly(time_band_plot, tooltip = c("x", "y"))

# Display the interactive plot
interactive_time_band_plot
library(ggplot2)
library(plotly)
library(dplyr)

# Define light condition mapping (STATS20)
light_conditions_map <- c(
  "1" = "Daylight",
  "4" = "Dark (street lights present and lit)",
  "5" = "Dark (street lights present but unlit)",
  "6" = "Dark (no street lights)",
  "7" = "Other"
)

# Map light conditions
road_casualities$light_conditions <- as.factor(
  light_conditions_map[as.character(road_casualities$light_conditions)]
)

# Ensure weather conditions are mapped
weather_conditions_map <- c(
  "1" = "Fine without high winds",
  "2" = "Raining without high winds",
  "3" = "Snowing without high winds",
  "4" = "Fine with high winds",
  "5" = "Raining with high winds",
  "6" = "Snowing with high winds",
  "7" = "Fog or mist",
  "8" = "Other",
  "9" = "Unknown"
)

road_casualities$weather_conditions <- as.factor(
  weather_conditions_map[as.character(road_casualities$weather_conditions)]
)

# Create summary data for heatmap
heatmap_data <- road_casualities %>%
  group_by(weather_conditions, light_conditions) %>%
  summarise(total_accidents = n(), .groups = "drop")

# Create the heatmap with data labels and contrasting colors
heatmap_plot <- ggplot(heatmap_data, aes(x = weather_conditions, y = light_conditions, fill = total_accidents)) +
  geom_tile(color = "white") +
  geom_text(aes(label = total_accidents), color = "black", size = 3) +  # Add data labels
  scale_fill_gradient(low = "#f9f9f9", high = "#d73027", name = "Total Accidents") +
  labs(
    title = "Accidents by Weather and Light Conditions",
    x = "Weather Conditions",
    y = "Light Conditions"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    axis.text.y = element_text(size = 10),
    plot.title = element_text(size = 14, face = "bold"),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8)
  )

# Make the heatmap interactive
interactive_heatmap <- ggplotly(heatmap_plot, tooltip = c("x", "y", "fill"))
interactive_heatmap

`

# Load the dataset
road_casualities <- read.csv("C:\\Users\\Venkata\\Desktop\\stat1\\Final project\\casualty_statistics.csv")

# Load necessary libraries
library(ggplot2)
library(dplyr)
library(plotly)

# Define STATS19 road surface condition mapping
road_conditions_map <- c(
  "1" = "Dry",
  "2" = "Wet or damp",
  "3" = "Snow",
  "4" = "Frost or ice",
  "5" = "Flood",
  "6" = "Oil or diesel",
  "7" = "Mud",
  "8" = "Other",
  "9" = "Unknown"
)

# Step 1: Filter valid road_surface_conditions values (1 to 9)
road_casualities <- road_casualities %>%
  filter(road_surface_conditions %in% c(1:9))

# Step 2: Map road conditions to descriptions
road_casualities$road_conditions_desc <- as.factor(
  road_conditions_map[as.character(road_casualities$road_surface_conditions)]
)

# Debugging: Check unique values in road_conditions_desc
print("Mapped descriptions:")
[1] "Mapped descriptions:"
print(unique(road_casualities$road_conditions_desc))
[1] Wet or damp  Dry          Unknown      Frost or ice Snow        
[6] Flood       
Levels: Dry Flood Frost or ice Snow Unknown Wet or damp
# Step 3: Aggregate data by road surface condition
road_summary <- road_casualities %>%
  group_by(road_conditions_desc) %>%
  summarise(total_accidents = n()) %>%
  arrange(desc(total_accidents))

# Debugging: Check aggregated data
print("Aggregated road surface condition data:")
[1] "Aggregated road surface condition data:"
print(road_summary)
# A tibble: 6 × 2
  road_conditions_desc total_accidents
  <fct>                          <int>
1 Dry                            72752
2 Wet or damp                    26944
3 Unknown                         1617
4 Frost or ice                    1461
5 Snow                             241
6 Flood                            179
# Step 4: Create the bar plot
road_plot <- ggplot(road_summary, aes(
  x = reorder(road_conditions_desc, -total_accidents), 
  y = total_accidents, 
  fill = road_conditions_desc, 
  text = paste0(road_conditions_desc, ": ", total_accidents, " accidents")
)) +
  geom_bar(stat = "identity", alpha = 0.8, color = "black") +
  labs(
    title = "Distribution of Accidents by Road Surface Conditions (STATS19)",
    x = "Road Surface Conditions",
    y = "Number of Accidents"
  ) +
  scale_fill_manual(values = c(
    "Dry" = "red",
    "Wet or damp" = "#73a2c6",
    "Snow" = "#1b9e77",
    "Frost or ice" = "#d95f02",
    "Flood" = "#7570b3",
    "Oil or diesel" = "#e7298a",
    "Mud" = "#66a61e",
    "Other" = "#e6ab02",
    "Unknown" = "#a6761d"
  )) 

# Convert the plot to an interactive plot
interactive_road_plot <- ggplotly(road_plot, tooltip = "text")

# Display the interactive plot
interactive_road_plot
# Load necessary libraries
library(caret)
# Load necessary libraries
library(nnet)  # For multinomial logistic regression
library(caret) # For data partitioning


# Read the dataset
data <- read.csv("C:\\Users\\Venkata\\Desktop\\stat1\\Final project\\casualty_statistics.csv")


# Select relevant columns and remove rows with missing values
data_cleaned <- na.omit(data[, c("accident_severity", "weather_conditions")])

# Convert columns to factors
data_cleaned$accident_severity <- as.factor(data_cleaned$accident_severity)
data_cleaned$weather_conditions <- as.numeric(data_cleaned$weather_conditions)

# Split the dataset into training and testing sets
set.seed(42)
train_index <- createDataPartition(data_cleaned$accident_severity, p = 0.8, list = FALSE)
train_data <- data_cleaned[train_index, ]
test_data <- data_cleaned[-train_index, ]

# Train multinomial logistic regression model
multinom_model <- multinom(accident_severity ~ weather_conditions, data = train_data)
# weights:  9 (4 variable)
initial  value 91633.053773 
iter  10 value 50406.679444
final  value 50392.493956 
converged
# Summarize the model
summary(multinom_model)
Call:
multinom(formula = accident_severity ~ weather_conditions, data = train_data)

Coefficients:
  (Intercept) weather_conditions
2     2.64414         0.06126746
3     3.76240         0.12285242

Std. Errors:
  (Intercept) weather_conditions
2  0.04240724         0.02156981
3  0.04154713         0.02120177

Residual Deviance: 100785 
AIC: 100793 
# Make predictions
predictions <- predict(multinom_model, newdata = test_data)

# Evaluate the model
confusion_matrix <- table(test_data$accident_severity, predictions)
print("Confusion Matrix:")
[1] "Confusion Matrix:"
print(confusion_matrix)
   predictions
        1     2     3
  1     0     0   304
  2     0     0  4687
  3     0     0 15859
# Calculate accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", round(accuracy, 4)))
[1] "Accuracy: 0.7606"
# Load necessary libraries
library(nnet)  # For multinomial logistic regression
library(caret) # For data partitioning
library(pROC)  # For AUC and ROC curves

# Read the dataset
data <- read.csv("C:\\Users\\Venkata\\Desktop\\stat1\\Final project\\casualty_statistics.csv")

# Select relevant columns and remove rows with missing values
data_cleaned <- na.omit(data[, c("accident_severity", "weather_conditions")])

# Convert columns to factors and numeric types
data_cleaned$accident_severity <- as.factor(data_cleaned$accident_severity)
data_cleaned$weather_conditions <- as.numeric(data_cleaned$weather_conditions)

# Split the dataset into training and testing sets
set.seed(42)
train_index <- createDataPartition(data_cleaned$accident_severity, p = 0.8, list = FALSE)
train_data <- data_cleaned[train_index, ]
test_data <- data_cleaned[-train_index, ]

# Train multinomial logistic regression model
multinom_model <- multinom(accident_severity ~ weather_conditions, data = train_data)
# weights:  9 (4 variable)
initial  value 91633.053773 
iter  10 value 50406.679444
final  value 50392.493956 
converged
# Predict probabilities for the test data
probabilities <- predict(multinom_model, newdata = test_data, type = "probs")

# Create one-vs-all ROC curves for each class
roc_curve_1 <- roc(as.numeric(test_data$accident_severity == 1), probabilities[, 1], plot = TRUE, col = "red", main = "ROC Curve for Multinomial Logistic Regression")
roc_curve_2 <- roc(as.numeric(test_data$accident_severity == 2), probabilities[, 2], plot = TRUE, col = "blue", add = TRUE)
roc_curve_3 <- roc(as.numeric(test_data$accident_severity == 3), probabilities[, 3], plot = TRUE, col = "green", add = TRUE)

# Add legend
legend("bottomright", legend = c("Class 1 (Fatal)", "Class 2 (Serious)", "Class 3 (Slight)"),
       col = c("red", "blue", "green"), lwd = 2)

# Compute AUC for each class
auc_1 <- auc(roc_curve_1)
auc_2 <- auc(roc_curve_2)
auc_3 <- auc(roc_curve_3)
# Load necessary libraries
# Load necessary libraries
library(randomForest)
library(pROC)
library(caret)

# Read the dataset
data <- read.csv("C:\\Users\\Venkata\\Desktop\\stat1\\Final project\\casualty_statistics.csv")

# Select relevant features and target variable
features <- c("weather_conditions", "number_of_vehicles", "road_surface_conditions", "urban_or_rural_area")
target <- "accident_severity"

# Filter data to include only relevant columns and remove rows with missing values
data_cleaned <- na.omit(data[, c(features, target)])

# Convert target variable to factor for classification
data_cleaned$accident_severity <- as.factor(data_cleaned$accident_severity)

# Split data into training and testing sets
set.seed(42)
train_index <- createDataPartition(data_cleaned$accident_severity, p = 0.8, list = FALSE)
train_data <- data_cleaned[train_index, ]
test_data <- data_cleaned[-train_index, ]

# Train a Random Forest model
rf_model <- randomForest(
  accident_severity ~ .,
  data = train_data,
  ntree = 100,
  importance = TRUE
)

# Print model summary
print(rf_model)

Call:
 randomForest(formula = accident_severity ~ ., data = train_data,      ntree = 100, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 100
No. of variables tried at each split: 2

        OOB estimate of  error rate: 23.95%
Confusion matrix:
  1  2     3  class.error
1 0  0  1218 1.0000000000
2 0  6 18745 0.9996800171
3 0 15 63424 0.0002364476
# Predict probabilities on the test set
rf_probabilities <- predict(rf_model, newdata = test_data, type = "prob")

# Draw AUC Curve for each class using one-vs-all approach
auc_values <- list()
for (i in 1:ncol(rf_probabilities)) {
  class_label <- colnames(rf_probabilities)[i]
  roc_curve <- roc(as.numeric(test_data$accident_severity == class_label), 
                   rf_probabilities[, i], 
                   plot = TRUE, 
                   main = paste("ROC Curve for Class", class_label), 
                   col = i)
  auc_values[[class_label]] <- auc(roc_curve)
  print(paste("AUC for Class", class_label, ":", round(auc(roc_curve), 4)))
}

[1] "AUC for Class 1 : 0.4996"

[1] "AUC for Class 2 : 0.5012"

[1] "AUC for Class 3 : 0.5012"
# Visualize feature importance
varImpPlot(rf_model)

only Logistic regression with 2 variables

# Load necessary libraries
library(nnet)    # For logistic regression
library(pROC)    # For ROC curve and AUC
library(caret)   # For data partitioning

# Read the dataset
data <- read.csv("C:\\Users\\Venkata\\Desktop\\stat1\\Final project\\casualty_statistics.csv")

# Select relevant columns and remove rows with missing values
data_cleaned <- na.omit(data[, c("accident_severity", "urban_or_rural_area", "weather_conditions")])

# Create a binary target variable (1 for Slight, 0 for Fatal and Serious)
data_cleaned$binary_severity <- ifelse(data_cleaned$accident_severity == 3, 1, 0)

# Convert necessary columns to numeric/factor
data_cleaned$binary_severity <- as.factor(data_cleaned$binary_severity)
data_cleaned$urban_or_rural_area <- as.numeric(data_cleaned$urban_or_rural_area)
data_cleaned$weather_conditions <- as.numeric(data_cleaned$weather_conditions)

# Split the dataset into training and testing sets
set.seed(42)
train_index <- createDataPartition(data_cleaned$binary_severity, p = 0.8, list = FALSE)
train_data <- data_cleaned[train_index, ]
test_data <- data_cleaned[-train_index, ]

# Train logistic regression model
logistic_model <- glm(binary_severity ~ urban_or_rural_area + weather_conditions, 
                      data = train_data, 
                      family = binomial)
print(logistic_model)

Call:  glm(formula = binary_severity ~ urban_or_rural_area + weather_conditions, 
    family = binomial, data = train_data)

Coefficients:
        (Intercept)  urban_or_rural_area   weather_conditions  
            1.60582             -0.40781              0.06153  

Degrees of Freedom: 83406 Total (i.e. Null);  83404 Residual
Null Deviance:      91810 
Residual Deviance: 91030    AIC: 91040
# Predict probabilities for the test data
probabilities <- predict(logistic_model, newdata = test_data, type = "response")

# Create the ROC curve
roc_curve <- roc(test_data$binary_severity, probabilities, plot = TRUE, col = "blue", 
                 main = "ROC Curve for Binary Classification")

# Compute and print AUC
auc_value <- auc(roc_curve)
print(paste("AUC Value:", round(auc_value, 4)))
[1] "AUC Value: 0.5498"
# Add thresholds to the ROC plot
plot(roc_curve, print.auc = TRUE, col = "blue")

Logistic regression Lasso wirth 2 combination variables

library(glmnet)

# Prepare data for glmnet
X <- as.matrix(train_data[, c("urban_or_rural_area", "weather_conditions")])
y <- as.numeric(train_data$binary_severity) - 1

# Train Lasso logistic regression
lasso_model <- cv.glmnet(X, y, family = "binomial", alpha = 1)

# Predict probabilities for test data
test_X <- as.matrix(test_data[, c("urban_or_rural_area", "weather_conditions")])
lasso_probabilities <- predict(lasso_model, newx = test_X, s = "lambda.min", type = "response")

# Evaluate using ROC and AUC
lasso_roc <- roc(as.numeric(test_data$binary_severity), lasso_probabilities, plot = TRUE, col = "blue",
                 main = "ROC Curve for Lasso Logistic Regression")

lasso_auc <- auc(lasso_roc)
print(paste("Lasso Logistic Regression AUC:", round(lasso_auc, 4)))
[1] "Lasso Logistic Regression AUC: 0.5514"

Logistic regression Lasso wirth more variables

# Load necessary libraries
library(glmnet)
library(pROC)

# Load the dataset
data <- read.csv("C:\\Users\\Venkata\\Desktop\\stat1\\Final project\\casualty_statistics.csv")

# Define selected features and target variable
selected_features <- c("accident_severity", "number_of_vehicles", "road_surface_conditions", 
                       "weather_conditions", "urban_or_rural_area", "special_conditions_at_site", 
                       "day_of_week", "time", "junction_detail", "speed_limit")

# Check if selected features exist in the dataset
if (!all(selected_features %in% colnames(data))) {
  missing_features <- selected_features[!selected_features %in% colnames(data)]
  stop(paste("The following features are missing from the dataset:", paste(missing_features, collapse = ", ")))
}

# Filter the dataset for selected features and remove rows with missing values
data_cleaned <- na.omit(data[, selected_features])

# Create a binary target variable for classification
# Severe (1) = accident_severity 1 or 2, Slight (0) = accident_severity 3
data_cleaned$binary_severity <- ifelse(data_cleaned$accident_severity == 3, 0, 1)

# Feature engineering: Convert 'time' into 'time_of_day' (e.g., Morning, Afternoon, Night)
data_cleaned$time <- as.numeric(sub("^(\\d{2}):.*$", "\\1", data_cleaned$time)) # Extract hour
data_cleaned$time_of_day <- cut(data_cleaned$time, 
                                breaks = c(-1, 6, 12, 18, 24), 
                                labels = c("Night", "Morning", "Afternoon", "Evening"))

# Drop unnecessary columns (original 'time' and 'accident_severity')
data_cleaned <- data_cleaned[, !(names(data_cleaned) %in% c("time", "accident_severity"))]

# Convert categorical variables into factors
categorical_vars <- c("road_surface_conditions", "weather_conditions", "urban_or_rural_area", 
                      "special_conditions_at_site", "day_of_week", "time_of_day", "junction_detail")
data_cleaned[categorical_vars] <- lapply(data_cleaned[categorical_vars], as.factor)

# Prepare data for glmnet
X <- model.matrix(binary_severity ~ ., data_cleaned)[, -1] # Remove intercept
y <- data_cleaned$binary_severity

# Split data into training and testing sets
set.seed(42)
train_index <- sample(1:nrow(X), size = 0.8 * nrow(X))
train_X <- X[train_index, ]
train_y <- y[train_index]
test_X <- X[-train_index, ]
test_y <- y[-train_index]

# Train Lasso Logistic Regression model
lasso_model <- cv.glmnet(train_X, train_y, family = "binomial", alpha = 1)

# Predict probabilities for test data
lasso_probabilities <- predict(lasso_model, newx = test_X, s = "lambda.min", type = "response")

# Evaluate using ROC and AUC
lasso_roc <- roc(test_y, lasso_probabilities, plot = TRUE, col = "blue",
                 main = "ROC Curve for Lasso Logistic Regression")

lasso_auc <- auc(lasso_roc)
print(paste("Lasso Logistic Regression AUC:", round(lasso_auc, 4)))
[1] "Lasso Logistic Regression AUC: 0.612"
# Feature importance (coefficients)
coefficients <- coef(lasso_model, s = "lambda.min")
print("Selected Features and Coefficients:")
[1] "Selected Features and Coefficients:"
print(coefficients)
48 x 1 sparse Matrix of class "dgCMatrix"
                                      s1
(Intercept)                 -1.159394968
number_of_vehicles          -0.256295633
road_surface_conditions1     0.937290733
road_surface_conditions2     0.925376470
road_surface_conditions3     0.958261001
road_surface_conditions4     0.708252313
road_surface_conditions5     0.944823220
road_surface_conditions9     0.445159967
weather_conditions2         -0.069904189
weather_conditions3         -0.117426522
weather_conditions4          0.155343359
weather_conditions5         -0.040067944
weather_conditions6         -0.075081224
weather_conditions7         -0.284458507
weather_conditions8         -0.169295042
weather_conditions9         -0.383972781
urban_or_rural_area1        -0.112744643
urban_or_rural_area2         0.086384660
urban_or_rural_area3         0.973339463
special_conditions_at_site0 -0.306314034
special_conditions_at_site1 -0.507191456
special_conditions_at_site2  0.314806500
special_conditions_at_site3 -0.115356865
special_conditions_at_site4 -0.412954815
special_conditions_at_site5 -0.116574223
special_conditions_at_site6 -0.353961785
special_conditions_at_site7 -0.336745861
special_conditions_at_site9 -1.031307541
day_of_week2                -0.136292827
day_of_week3                -0.175948703
day_of_week4                -0.120039691
day_of_week5                -0.124894449
day_of_week6                -0.104736531
day_of_week7                -0.002913686
junction_detail0             0.091600324
junction_detail1            -0.453624650
junction_detail2            -0.188097911
junction_detail3             .          
junction_detail5            -0.201307564
junction_detail6             0.008137821
junction_detail7             .          
junction_detail8             .          
junction_detail9             0.001650131
junction_detail99           -1.819330521
speed_limit                  0.007150706
time_of_dayMorning          -0.318457391
time_of_dayAfternoon        -0.265087170
time_of_dayEvening          -0.113424581

Loading Libraries:

# Load required libraries

suppressWarnings({
 library(tidyverse)
library(ggplot2)
library(plotly)  
library(maps)
library(leaflet)
library(readr)

library(plotly)
library(dplyr)
library(viridis) 
library(rnaturalearth)  # For loading world map data


})

Average Suicidal Rates By Country from 1950 to 2022:

# Load the data (change the path to where your file is stored)
suicide_data <- read.csv("suicide-rates-all.csv")

# Summarize the data to get the average suicide rate for each country
average_suicide_rate <- suicide_data %>%
  group_by(Country) %>%
  summarise(Average_Suicide_Rate = mean(suscide.rate, na.rm = TRUE), .groups = "drop")  # Add .groups = "drop" to avoid warnings

# Load world map data
world <- ne_countries(scale = "medium", returnclass = "sf")

# Merge world map data with average suicide rates
world_data <- merge(world, average_suicide_rate, by.x = "name", by.y = "Country", all.x = TRUE)

# Create color palette for suicide rates
pal <- colorNumeric(palette = "YlOrRd", domain = world_data$Average_Suicide_Rate, na.color = "transparent")

# Create interactive leaflet map
leaflet(world_data) %>%
  addTiles() %>%
  addPolygons(
    fillColor = ~pal(Average_Suicide_Rate),
    weight = 1,
    opacity = 1,
    color = "white",
    dashArray = "3",
    fillOpacity = 0.7,
    highlight = highlightOptions(
      weight = 5,
      color = "#666",
      dashArray = "",
      fillOpacity = 0.7,
      bringToFront = TRUE
    ),
    label = ~paste(name, "Average Suicide Rate: ", round(Average_Suicide_Rate, 2)),
    labelOptions = labelOptions(
      style = list("font-weight" = "normal", padding = "3px 8px"),
      textsize = "15px",
      direction = "auto"
    )
  ) %>%
  addLegend(pal = pal, values = ~Average_Suicide_Rate, opacity = 0.7, title = "Average Suicide Rate per 100,000", position = "bottomright") %>%
  addControl("<strong>Average Suicide Rates by Country of all the ages over the years 1950 to 2022</strong>", position = "topright", 
             className = "map-title")  # Add title to the map

Average Suicide Rates of all the ages by Country for the top rated year: 1982

# Load the data (change the path to where your file is stored)
suicide_data <- read.csv("suicide-rates-all.csv")

# Calculate the overall average suicide rate for each year across all countries
yearly_avg_suicide_rate <- suicide_data %>%
  group_by(Year) %>%
  summarise(Average_Suicide_Rate = mean(suscide.rate, na.rm = TRUE))

# Find the year with the highest overall average suicide rate
top_year <- yearly_avg_suicide_rate %>%
  filter(Average_Suicide_Rate == max(Average_Suicide_Rate)) %>%
  pull(Year)

# Filter the data to include only the data for the top year
top_year_data <- suicide_data %>%
  filter(Year == top_year)

# Load world map data
world <- ne_countries(scale = "medium", returnclass = "sf")

# Merge world map data with suicide rate data for the top year
world_data <- merge(world, top_year_data, by.x = "name", by.y = "Country", all.x = TRUE)

# Create color palette for the suicide rates in the top year
pal <- colorNumeric(palette = "YlOrRd", domain = world_data$suscide.rate, na.color = "transparent")

# Create interactive leaflet map
map <- leaflet(world_data) %>%
  addTiles() %>%
  addPolygons(
    fillColor = ~pal(suscide.rate),
    weight = 1,
    opacity = 1,
    color = "white",
    dashArray = "3",
    fillOpacity = 0.7,
    highlight = highlightOptions(
      weight = 5,
      color = "#666",
      dashArray = "",
      fillOpacity = 0.7,
      bringToFront = TRUE
    ),
    label = ~paste(name, "Suicide Rate: ", round(suscide.rate, 2)),
    labelOptions = labelOptions(
      style = list("font-weight" = "normal", padding = "3px 8px"),
      textsize = "15px",
      direction = "auto"
    )
  ) %>%
  addLegend(pal = pal, values = ~suscide.rate, opacity = 0.7, title = paste("Suicide Rate per 100,000 in ", top_year), position = "bottomright")%>%
addControl("<strong>Average Suicide Rates of all the ages by Country for the top rated year: 1982</strong>", position = "topright", 
             className = "map-title")  # Add title to the map

# Show the map
map

Average Suicide Rates by Year all over the World:

# Read the data
data <- read_csv(
  "C:\\Users\\Venkata\\Desktop\\stat1\\Website project\\website\\suicide-rates-all.csv",
  col_types = cols(
    Country = col_character(),
    Code = col_character(),
    Year = col_double(),
    `suscide-rate` = col_double()
  )
)

# Step 1: Calculate the average suicide rate for each year
avg_suicide_by_year <- data %>%
  group_by(Year) %>%
  summarise(Average_suicide_rate = round(mean(`suscide-rate`, na.rm = TRUE), 2), .groups = "drop")

# Step 2: Create the frequency polygon
frequency_polygon <- ggplot(avg_suicide_by_year, aes(x = Year, y = Average_suicide_rate)) +
  geom_line(stat = "identity", color = "blue", size = 1) +  # Line to connect data points
  geom_point(color = "red") +  # Points for each year
  labs(title = "Suicide Rates trend of all the ages and Years all over the world", 
       x = "Year", 
       y = "Average Suicide Rate per 100,000") +
  
  theme(
    plot.title = element_text(face = "bold"), 
    axis.title.x = element_text(face = "bold"), 
    axis.title.y = element_text(face = "bold")
  )

# Convert the frequency polygon to an interactive plot
interactive_frequency_polygon <- ggplotly(frequency_polygon)

# Show the interactive frequency polygon
interactive_frequency_polygon

Top 5 Years with Highest Suicide Rates in Top 5 Countries:

# Load the data (update the path to your CSV file accordingly)
suicide_data <- read.csv("C:\\Users\\Venkata\\Desktop\\stat1\\Website project\\website\\suicide-rates-all.csv")

# Calculate the average suicide rate per country and filter top 5 countries
top_5_countries <- suicide_data %>%
  group_by(Country) %>%
  summarise(Average_Suicide_Rate = mean(suscide.rate, na.rm = TRUE)) %>%
  top_n(5, wt = Average_Suicide_Rate)

# Find the top 5 years for each of the top 5 countries
top_5_years_per_country <- suicide_data %>%
  filter(Country %in% top_5_countries$Country) %>%
  group_by(Country, Year) %>%
  summarise(Suicide_Rate = mean(suscide.rate, na.rm = TRUE)) %>%
  arrange(Country, desc(Suicide_Rate)) %>%
  group_by(Country) %>%
  slice_max(Suicide_Rate, n = 5)

# Create a horizontal bar plot with increased bar size
p_bar <- ggplot(top_5_years_per_country, aes(x = reorder(Country, Suicide_Rate), y = Suicide_Rate, fill = Year, text = paste("Country:", Country, "<br>Year:", Year, "<br>Suicide Rate:", round(Suicide_Rate, 2)))) +
  geom_bar(stat = "identity", position = position_dodge(), width = 0.9) +  # Adjust width to increase bar size
  scale_fill_gradient(low = "#FF9999", high = "#CC0000") +  # Light to dark red gradient
  labs(title = "Top 5 Countries with Highest Suicide Rates and Their Top 5 Years",
       x = "Suicide Rate",
       y = "Country") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none") +
  coord_flip() +  # Flip the axes to create horizontal bars
  
  # Add year labels to the bars
  geom_text(aes(label = Year), 
            position = position_dodge(width = 0.9),  # Ensure labels match bar positions
            hjust = -0.2,   # Adjust horizontal position of text
            size = 3,       # Set a smaller text size
            fontface = "bold")       # Set a smaller text size

# Convert ggplot to interactive plotly plot
interactive_bar_plot <- ggplotly(p_bar, tooltip = "text")

# Display the interactive plot
interactive_bar_plot

Average Suicide Rates for Ages 15-19 over the years:

# Load the CSV file
children_data <- read.csv("C:\\Users\\Venkata\\Desktop\\stat1\\Website project\\website\\suicide-rates-by-age.csv")

# Step 1: Rename columns to appropriate labels
colnames(children_data) <- c("Country", "Code", "Year", 
                              "Death_rate_15_19", "Death_rate_20_24", "Death_rate_25_29", 
                              "Death_rate_30_34", "Death_rate_35_39", "Death_rate_40_44", 
                              "Death_rate_45_49", "Death_rate_50_54", "Death_rate_55_59", 
                              "Death_rate_60_64", "Death_rate_65_69", "Death_rate_70_74", 
                              "Death_rate_75_79", "Death_rate_80_84", "Death_rate_over_85")

# Step 2: Calculate the average suicide rate for children (using 'Death_rate_15_19' as an example)
avg_suicide_rate <- children_data %>%
  group_by(Country) %>%
  summarise(AverageRate = round(mean(Death_rate_15_19, na.rm = TRUE),2))

# Step 3: Load map data
world_map <- map_data("world")

# Step 4: Merge average suicide rates with the map data
map_data <- world_map %>%
  left_join(avg_suicide_rate, by = c("region" = "Country")) # Ensure to match the correct column names

# Step 5: Create the static map using ggplot2
gg <- ggplot(data = map_data, aes(x = long, y = lat, group = group, fill = AverageRate)) +
  geom_polygon(color = "black", aes(text = paste("Country:", region, "<br>Average Suicide Rate (Ages 15-19):", round(AverageRate, 2)))) +
  scale_fill_gradient(low = "lightblue", high = "red", na.value = "grey50", 
                      name = "Average Suicide Rate\n(Ages 15-19)") +
  labs(title = "Average Suicide Rate of Children by Country (Ages 15-19)",
       x = "Longitude", 
       y = "Latitude") +
  theme_minimal() +
  theme(legend.position = "right")

# Step 6: Convert ggplot to an interactive plotly object
interactive_map <- ggplotly(gg, tooltip = "text")

# Step 7: Display the interactive map
interactive_map

Average Suicide Rate for the Top 20 Nations in the 15-19 Age Group (Across All Years):

# Load necessary libraries
library(ggplot2)
library(dplyr)
library(plotly)

# Load the data
data <- read.csv("C:\\Users\\Venkata\\Desktop\\stat1\\Website project\\website\\suicide-rates-by-age.csv")

# Clean column names for easier access
colnames(data) <- c("Entity", "Code", "Year", 
                    "Death_rate_15_19", "Death_rate_20_24", "Death_rate_25_29", 
                    "Death_rate_30_34", "Death_rate_35_39", "Death_rate_40_44", 
                    "Death_rate_45_49", "Death_rate_50_54", "Death_rate_55_59", 
                    "Death_rate_60_64", "Death_rate_65_69", "Death_rate_70_74", 
                    "Death_rate_75_79", "Death_rate_80_84", "Death_rate_over_85")

# Filter the data for the 15-19 age group and remove missing values
data_filtered <- data %>%
  filter(!is.na(Death_rate_15_19))

# Calculate the average suicide rate for each country across all years
avg_suicide_rate <- data_filtered %>%
  group_by(Entity) %>%
  summarise(suscide_Rate = round(mean(Death_rate_15_19, na.rm = TRUE), 2)) %>%
  arrange(desc(suscide_Rate)) %>%
  top_n(20, suscide_Rate)

# Create the bar plot with ggplot
bar_plot <- ggplot(avg_suicide_rate, aes(x = reorder(Entity, suscide_Rate), 
                                           y = suscide_Rate, 
                                           fill = suscide_Rate,
                                           text = paste("Country:", Entity, "<br>Average Suicide Rate:", suscide_Rate))) +
  geom_bar(stat = "identity") +
  coord_flip() +  # Flip coordinates to make country names readable
  scale_fill_gradient(low = "#ffcccc", high = "#990000") +  # Light red to dark red gradient
  labs(title = "Suicide Rates Among 15-19 Year Olds in the Top 20 Nations (All Years)", 
       x = "Country", 
       y = "Average Suicide Rate per 100,000 Population") +
  theme(axis.text.y = element_text(angle = 0, hjust = 1, size = 8)) +  # Adjust text angle and size
  guides(fill = guide_legend(title = "Suicide Rate per 100,000"))  # Change the legend title

# Convert to an interactive plot with plotly
interactive_plot <- ggplotly(bar_plot, tooltip = "text")  # Use the 'text' aesthetic for tooltip

# Show the interactive plot
interactive_plot